/*******************************************************************************
********************************************************************************

Replication code for:
	Decomposing the Wedge Between Projected and Realized Returns 
	in Energy Efficiency Programs

Peter Christensen, Paul Francisco, Erica Myers, and Mateus Souza

For inquiries, please contact Mateus Souza: mateusmeirelles@gmail.com

********************************************************************************
*******************************************************************************/




********************************************************************************
****************************** START OF PREAMBLE *******************************

* Start from a clean session and turn off output paging. The "perm" option
* makes the paging setting persist in future Stata sessions as well.
clear all
set more off, perm

** Set paths for data and programs
* Only maindir and rpath are machine-specific. rpackages is derived from
* maindir so the two locations cannot drift apart when the project is moved.
gl maindir "C:/Users/mnogueir/Dropbox/IHWAP/REStat_Replication" /* Where code, data and results are stored. */
gl rpath "C:/Program Files/R/R-3.6.3/bin/Rscript.exe" /* Where R is installed. Note version R-3.6.3 is required. */
gl rpackages "$maindir/Code/RPackages" /* Where R packages are installed (inside the replication folder). */

** Clean and set memory
set maxvar 30000
set matsize 11000
* Recompile reghdfe's Mata code. This requires the reghdfe package to be
* installed already (see the package installation notes below).
reghdfe, compile

/*

********** IMPORTANT PACKAGE INSTALLATIONS:

****** Install Stata Packages

*** Stata scripts assume that the following packages are installed:
* todate winsor2 reghdfe ftools moremata ds3 ivreghdfe 
*	ivreg2 ranktest rscript labutil distinct psacalc 
*	st0085_2 gr0059_1 st0043_2 st0035_1

* The Stata package versions can be found in $maindir/Code/StataPackages
* These can be installed by running:
do "$maindir/Code/install_packages_Stata.do"


****** Install R Packages

*** Machine Learning algorithms use R version R-3.6.3 

*** R scripts assume that the following packages are installed:
* "nnls", "SuperLearner", "dplyr", "arm", "onehot", "grf", "xgboost", "caret", "data.table",
* "knitr", "kableExtra", "magrittr", "kimisc", "readstata13", "DescTools", "scales", "ggplot2"

* The R package versions can be found in $maindir/Code/RPackages
* These can be installed by running:
rscript using "$maindir/Code/install_packages_R.R", args("$maindir" "$rpackages") rpath("$rpath")

*/


******************************* END OF PREAMBLE ********************************
********************************************************************************




********************************************************************************
*************************** MACHINE LEARNING ***********************************

***** PRE-PROCESSING FOR ML
* All R scripts in this section are launched through the rscript command using
* the R executable in $rpath; $maindir and $rpackages are passed as arguments.
*** pre-processing for tree based algorithms
rscript using "$maindir/Code/preprocess_ml.R", args("$maindir" "$rpackages") rpath("$rpath")
* Outputs: ihwap_full.csv

*** pre-processing for other algorithms (which need to pre-define interactions)
* In addition to the R script above, also run:
do "$maindir/Code/preprocess_ml_flex.do"
* Outputs: ihwap_full_flex.csv


*********** WARNING: THIS PART OF THE CODE TAKES VERY LONG TO RUN **************

/* 
This portion of the code is commented out because it is very computationally 
demanding. It could take more than 200 hours, depending on memory configurations.
Code and outputs from the machine learning step are nevertheless provided for replication. 
*/

* The tuning step below is disabled by the block comment opened on the
* "DELETE THIS LINE" line; it stays disabled until that line is removed.
/* DELETE THIS LINE TO RUN ML TUNING

***** Hyperparameter tuning for different algorithms
*** hyperparameter tuning for XGBoost
rscript using "$maindir/Code/tuning_xgboost.R", args("$maindir" "$rpackages") rpath("$rpath")
* Outputs: predictpre_model_xgboost.rds; predictpre_results_xgboost.csv; predictpost_results_xgboost.csv

*** hyperparameter tuning for other algorithms (Random Forest and Elastic Net)
rscript using "$maindir/Code/tuning_rf.R", args("$maindir" "$rpackages") rpath("$rpath")
* Outputs: predictpre_model_rf.rds; predictpre_results_rf.csv; predictpost_results_rf.csv
rscript using "$maindir/Code/tuning_elastnet.R", args("$maindir" "$rpackages") rpath("$rpath")
* Outputs: predictpre_model_elastnet.rds; predictpre_results_elastnet.csv; predictpost_results_elastnet.csv
*/

***** Running the best-performing model
* This step is NOT commented out: it runs on every execution of this file,
* whether or not the tuning step above is enabled.
*** full sample run
rscript using "$maindir/Code/ml_best_model.R", args("$maindir" "$rpackages") rpath("$rpath")
* Outputs: predictpre_model_best.rds; CV_predictpre_best.csv

* The bootstrap step below is disabled in the same way as the tuning step.
/* DELETE THIS LINE TO RUN ML BOOTSTRAPPING
***** Bootstrapping /* WARNING: TAKES VERY LONG TO RUN */
*** generate 200 bootstrap samples
rscript using "$maindir/Code/preprocess_bootstrap.R", args("$maindir" "$rpackages") rpath("$rpath")
* Outputs: 200 files ihwap_boots'number'.csv

*** run the best model for the 200 bootstrap samples
* The loop passes the bootstrap sample number to predict_bootstrap.R as a
* third argument, one R call per sample.
forval boots = 1/200 {
	di "Running model for bootstrap sample `boots' ..."
	
	rscript using "$maindir/Code/predict_bootstrap.R", args("$maindir" "$rpackages" "`boots'") rpath("$rpath")
	
	di "Finished model for bootstrap sample `boots'."	
}
* Outputs: 200 files predpre_ihwap_boots'number'.csv

*/
****************************** END OF WARNING **********************************

************************* END OF MACHINE LEARNING ******************************
********************************************************************************




********************************************************************************
**************************** PRISM PER HOME ************************************

* NOTE: the block comment opened on the next line disables BOTH prism_perhome.do
* and combine_ml.do. combine_ml.do lists ihwap_wedge1.dta to ihwap_wedge4.dta as
* its outputs, and files with those names are read from Results/Model_Outputs in
* the PREPARING DATA section, so they must already exist there while this
* section stays disabled.
/* DELETE THIS LINE TO RUN PRISM PER HOME
*********** WARNING: THIS PART OF THE CODE TAKES VERY LONG TO RUN **************

/* Since we need to iterate over several models for each home, this part of the 
code also takes long to run. We are providing the data, code, and outputs for 
replication purposes. */

do "$maindir/Code/prism_perhome.do"
* Outputs: prism_gas.dta

/***** Finally, merge all ML results, merge with PRISM results,
	plus some pre-processing, and merging with more info from ihwap_state.dta */
do "$maindir/Code/combine_ml.do"
* Outputs: ihwap_wedge1.dta; ihwap_wedge2.dta; ihwap_wedge3.dta; ihwap_wedge4.dta
* Note: dataset was split into 4 parts to circumvent file size limits from repository

*/
****************************** END OF WARNING **********************************

************************** END OF PRISM PER HOME *******************************
********************************************************************************




********************************************************************************
**************************** PREPARING DATA ************************************

***** Appending files that constitute the main dataset of the study
* The four parts are stacked in order: ihwap_wedge1.dta is loaded first, then
* parts 2 to 4 are appended to it.
clear all
use "$maindir/Results/Model_Outputs/ihwap_wedge1.dta"
append using "$maindir/Results/Model_Outputs/ihwap_wedge2.dta"
append using "$maindir/Results/Model_Outputs/ihwap_wedge3.dta"
append using "$maindir/Results/Model_Outputs/ihwap_wedge4.dta"

*** This is the main dataset of the study:
* "replace" overwrites any ihwap_wedge.dta left over from a previous run.
save "$maindir/Results/Model_Outputs/ihwap_wedge.dta", replace

************************** END OF PREPARING DATA *******************************
********************************************************************************




********************************************************************************
************************** DESCRIPTIVE STATISTICS ******************************

***** Descriptive statistics for study sample
* Single entry point for this section; the outputs listed below are attributed
* to descriptives.do.
do "$maindir/Code/descriptives.do"
* Outputs:
* Figure 1: ratio_totalbins.pdf
* Appendix Table A.1: descriptives.tex

*********************** END OF DESCRIPTIVE STATISTICS **************************
********************************************************************************




********************************************************************************
**************************** ML PERFORMANCE ************************************

***** Details on performance of machine learning algorithms
* Two do-files run here: one compares hyperparameter configurations across
* algorithms, the other examines the errors of the best-performing model.
* NOTE(review): presumably these read the result files written by the machine
* learning section (e.g. predictpre_results_xgboost.csv) - confirm in the do-files.
*** Assess performance of models with different hyperparameter configurations
do "$maindir/Code/assess_tuning.do"
* Outputs: 
* Appendix Table B.1: xgboost_results.tex
* Appendix Table B.2: randomforest_results.tex
* Appendix Table B.3: elasticnet_results.tex

*** Assess detailed errors of best performing model
do "$maindir/Code/assess_errors_best.do"
* Outputs:
* Appendix Figure B.1: resids_hist_in.png;resids_hist_cv.png
* Appendix Figure B.2: mlresids_in.png; mlresids_cv.png
/* Appendix Figure B.3: mlerrors_blowerpre_cv.png; mlerrors_blowerpost_cv.png; 
	mlerrors_sqft_cv.png; mlerrors_famsize_cv.png; mlerrors_income_cv.png;
	mlerrors_agency_cv.png; mlerrors_months_cv.png; mlerrors_meantemp_cv.png */
/* Appendix Figure B.4: mlerrors_AirSeal_cv.png; mlerrors_AirCon_cv.png;
	mlerrors_Attic_cv.png; mlerrors_Baseload_cv.png; mlerrors_Door_cv.png;
	mlerrors_Foundation_cv.png; mlerrors_Furnace_cv.png; mlerrors_General_cv.png;
	mlerrors_HealSfty_cv.png; mlerrors_WallIns_cv.png; 
	mlerrors_WtHtr_cv.png; mlerrors_Window_cv.png */

*************************** END OF ML PERFORMANCE ******************************
********************************************************************************




********************************************************************************
********************** AVERAGE EFFECTS OF THE PROGRAM **************************

***** Estimating ATTs according to fixed effects models and machine learning
* Single entry point for this section; the figure and tables listed below are
* attributed to average_effects.do.
do "$maindir/Code/average_effects.do"
* Outputs:
* Figure 2: ML_trueVSpredict.pdf
* Appendix Table C.1: ATT_fulltable.tex
* Appendix Table C.2: ATT_fulltable_levels.tex
* Appendix Table C.3: ATT_gasVSelec.tex
* Appendix Figure C.1: ATT_paralleltrends.png 

******************* END OF AVERAGE EFFECTS OF THE PROGRAM **********************
********************************************************************************




********************************************************************************
************* WEDGE HETEROGENEITY BY UPGRADE-SPECIFIC SPENDING *****************

***** Regression to decompose the wedge
* Run this before the plotting script: per the output note below, it writes the
* .dta files that are used for the graphs decomposing the wedge.
do "$maindir/Code/decompose_wedge.do"
* Outputs:
* Table 1: simul_gap_results.tex; simul_gap_pctresults.tex
* To be used for graphs decomposing the wedge: decompose_wedge.dta; histogram_data.dta

*** Plot results
do "$maindir/Code/wedgebymeasures_graphs.do"
* Outputs:
/* Figure 3: wedge_furnace_overlay.pdf; wedge_windows_overlay.pdf; 
	wedge_airseal_overlay.pdf; wedge_attic_overlay.pdf;
	wedge_wallins_overlay.pdf; wedge_graph_legend.png
*/

********** END OF WEDGE HETEROGENEITY BY UPGRADE-SPECIFIC SPENDING *************
********************************************************************************




********************************************************************************
**************************** CONTRACTOR QUALITY ********************************

***** Analyses of contractor quality, from Section 4.2 of the paper
* Single entry point for this section; every table and figure listed below is
* attributed to contractor_quality.do.
do "$maindir/Code/contractor_quality.do"
* Outputs:
* Table 2: contqual_1ststage_save.tex; oster_combined_results.tex
* Table 3: simulations_contractors.tex; simulations_contractors_dropinter.tex
* Appendix Figure E.1: savpcthist_AirSeal_conthomes.png; savpcthist_AirSeal_contquint1.png; savpcthist_AirSeal_contquint5.png
* Appendix Figure E.2: savpcthist_Attic_conthomes.png; savpcthist_Attic_contquint1.png; savpcthist_Attic_contquint5.png
* Appendix Figure E.3: savpcthist_Furnace_conthomes.png; savpcthist_Furnace_contquint1.png; savpcthist_Furnace_contquint5.png
* Appendix Figure E.4: savpcthist_WallIns_conthomes.png; savpcthist_WallIns_contquint1.png; savpcthist_WallIns_contquint5.png
* Appendix Figure E.5: savpcthist_Window_conthomes.png; savpcthist_Window_contquint1.png; savpcthist_Window_contquint5.png
* Appendix Figure E.6: wedge_wallins_contqual1.png; wedge_wallins_contqual3.png; wedge_wallins_contqual5.png

************************ END OF CONTRACTOR QUALITY *****************************
********************************************************************************




********************************************************************************
************************** BEHAVIORAL RESPONSES ********************************

***** Analyses of how behavior changes can affect the wedge, from Section 4.3 of the paper
* Two do-files run here: the main behavioral-response analysis, then a second
* script covering other behavioral factors.
do "$maindir/Code/behavioral_responses.do"
* Outputs:
* Figure 4: tavgbin_Gas_raw.pdf; hddbest_prepost_controls.pdf
/* Table 4: new_reboundsave_results.tex; new_reboundsave_pctresults.tex;
		new_rebound_results.tex; new_rebound_pctresults.tex */
* Appendix Figure F.1: hddbest_prepost.pdf
* Results from Appendix F.2: appendixf2.tex

*** Effects of other behavioral factors
do "$maindir/Code/behavior_misc.do"
* Outputs:
* Appendix Table F.1: gapreg_behavior.tex
* Appendix Figure F.2: corr_hdd_rsq_pre.png
* Appendix Figure F.3: corr_hdd_rsq_post.png

*********************** END OF BEHAVIORAL RESPONSES ****************************
********************************************************************************




********************************************************************************
************************* COST-BENEFIT ANALYSES ********************************

***** Build dataset to be used in cost-benefit analyses
* NOTE(review): the scripts in this section appear to form a chain (build the
* CBA dataset, run the per-home analysis, then produce tables and graphs).
* Keep this order unless the do-files show the steps are independent.
do "$maindir/Code/build_cba.do"
* Outputs: cba_data.dta (large file, about 8.5 GB)

*** Home-specific cost-benefit analyses
do "$maindir/Code/cba_perhome.do"
* Outputs: cba_scc.dta; cba_retail.dta

*** Produce main graphs and tables from CBA analyses
do "$maindir/Code/cba_results.do"
* Outputs:
* Figure 5: npb_homerank_12month_scc.pdf; npb_homerank_12month_retail.pdf
* Appendix Figure G.1: npb_homerank_fullsamp_scc.png
* Appendix Figure G.2: npb_homerank_prism_scc.png
* Appendix Table G.4: cba_byPY_scc.tex; cba_byPY_retail.tex
* Appendix Table G.5: costperco2_scc.tex

*** CBA tables from the Appendix
do "$maindir/Code/cba_apptables.do"
* Outputs:
/* Appendix Table G.1: cbatable_year10_scc.tex; cbatable_year20_scc.tex; 
	cbatable_year30_scc.tex; cbatable_year40_scc.tex; cbatable_disc0_scc.tex;
	cbatable_disc6_scc.tex; cbatable_year10_retail.tex; cbatable_year20_retail.tex;
	cbatable_year30_retail.tex; cbatable_year40_retail.tex; 
	cbatable_disc0_retail.tex; cbatable_disc6_retail.tex */
* Appendix Table G.2: cbatable_year30_scc.tex; cbatable_year30_retail.tex

*** Miscellaneous results from CBA
do "$maindir/Code/cba_misc.do"
* Outputs:
* Appendix Table G.3: ttests_nonzerospending_year30.tex; ttests_save_year30.tex
* Appendix Figure G.3: gap_histogram_year30.png

********************** END OF COST-BENEFIT ANALYSES ****************************
********************************************************************************

* Print a completion message with the current time and date, taken from
* Stata's system macros S_TIME and S_DATE.
display "Code ended running at $S_TIME ; $S_DATE"

